#This file generates border_tiles.RData used for country-year analysis
rm(list=ls())

library(openxlsx)
library(ggplot2)
library(ggthemes)
library(reshape2)
library(tokenizers)
library(dplyr)
library(tidyverse)
library(countrycode)
library(stringr)
library(survival)
library(zoo)
library(readr)
library(readstata13)
library(ggthemes)
library(tidyr)
library(sp)
library(plyr)
library(foreign)
library(bife)
library(estprod)


recode_countries <- function(vec, scheme_1, scheme_2){
  # new values come first
  recode_vals <- as.character(unique(countryref[countryref$type == 'country', scheme_2]))
  names(recode_vals) <- as.character(unique(countryref[countryref$type == 'country', scheme_1]))
  
  recode_vals <- recode_vals[!is.na(names(recode_vals))]
  
  
  vec <- recode(vec, !!!recode_vals)
  return(vec)
}

#################################################################
# DVs = who focuses on border issues, and sentiment WRT borders #
#################################################################
undergrad_predicted <- read.csv('kw_relevance_predicted_set.csv', stringsAsFactors = FALSE)
full_unga_speeches <- read.csv('UNGAplenary_tiles_v5.csv', stringsAsFactors = FALSE)
borders_unga_speeches <- read.csv('keyword_filtered_tiles.csv', stringsAsFactors = FALSE)

# filter non-relevant undergrad speeches
undergrad_predicted <- undergrad_predicted %>% select(-starts_with('x')) %>% filter(Predicted == 1)

full_unga_speeches$n_words <- count_words(full_unga_speeches$text)
borders_unga_speeches$n_words <- count_words(borders_unga_speeches$text)

# Mention DV
border_tiles <- full_unga_speeches %>% group_by(country, year) %>% dplyr::summarise(tiles_overall = n())
border_tiles <- borders_unga_speeches %>% group_by(country, year) %>% dplyr::summarise(tiles_borders = n()) %>% right_join(border_tiles)

border_tiles <- full_unga_speeches %>% group_by(country, year) %>% dplyr::summarise(words_overall = sum(n_words)) %>% right_join(border_tiles)
border_tiles <- borders_unga_speeches %>% group_by(country, year) %>% dplyr::summarise(words_borders = sum(n_words)) %>% right_join(border_tiles)

# added undergrad DV
border_tiles <- undergrad_predicted %>% group_by(country, year) %>% dplyr::summarise(tiles_borders_undergrads = n()) %>% right_join(border_tiles)

border_tiles <- replace_na(border_tiles, list(tiles_borders = 0, words_borders = 0, tiles_borders_undergrads = 0))

sum(border_tiles$tiles_borders_undergrads > 0)

border_tiles$prop_tiles <- border_tiles$tiles_borders/border_tiles$tiles_overall
border_tiles$prop_words <- border_tiles$words_borders/border_tiles$words_overall

# drop EC, EU
border_tiles <- border_tiles %>% filter(!country %in% c('EC', 'EU'))

# add missing obs back to 1967 for all countries, for matching purposes down the line
countries <- unique(border_tiles$country)
years <- 1967:2017
df_1970 <- data.frame(matrix(NA, nrow=length(countries)*length(years), ncol=3))
names(df_1970) <- c('country', 'year', 'null')
df_1970$country <- sort(rep(countries, length(years)))
df_1970$year <- rep(years, length(countries))

border_tiles <- full_join(border_tiles, df_1970, by=c('country', 'year')) %>% select(-null)

rm(df_1970)

####################
# Sentiment DV     #
####################
load('sentiment_tiles.RData')
sentiment <- tiles
rm(tiles)

border_tiles <- sentiment %>% group_by(year, country) %>% dplyr::summarise(mean_predicted_sentiment = mean(predicted_sentiment)) %>% 
  select(country, year, mean_predicted_sentiment) %>% right_join(border_tiles)

border_tiles <- border_tiles[order(border_tiles$year),]
border_tiles <- border_tiles[order(border_tiles$country),]

rm(sentiment)

border_tiles <- border_tiles %>% mutate(mean_predicted_sentiment = ifelse(tiles_borders_undergrads == 0,
                                                                          NA, mean_predicted_sentiment))

# anxiety
anxiety <- read_csv('country_anxiety.csv')
border_tiles <- left_join(border_tiles, anxiety, by =c('country', 'year'))

# anger
anger <- read_csv('country_anger.csv')
border_tiles <- left_join(border_tiles, anger, by =c('country', 'year'))

# lags
border_tiles$lagged_sentiment <- panel_lag(border_tiles$mean_predicted_sentiment, id=border_tiles$country, time=border_tiles$year, lag=1)
border_tiles$lagged_anxiety <- panel_lag(border_tiles$mean_anxiety, id=border_tiles$country, time=border_tiles$year, lag=1)

###########
# Controls#
###########
# GDP, Population
world_bank <- read.csv('C:/Users/Siyao/Dropbox/Borders - Sentiments/data/world_bank.csv', na.strings = '..')
names(world_bank)[1] <- 'Series.Name'
world_bank <- world_bank %>% select(matches('X1|X2|Series\\.Name|Country\\.Code')) %>% 
  filter(Series.Name %in% unique(Series.Name)[1:51]) %>%
  gather(key="year", value="value", X1970..YR1970.:X2017..YR2017.) %>%
  spread(Series.Name, value) %>% 
  mutate(year=as.numeric(str_extract(year, '[0-9]{4}'))) %>%
  select(year, country=Country.Code, GDP = "GDP per capita (current US$)", Population = "Population, total")

# deal with CZE/CSK
world_bank <- rbind(world_bank,
                    data.frame('country' = 'CSK',
                               'year'=1970:1992,
                               'GDP' = world_bank$GDP[world_bank$country == 'CZE' & world_bank$year %in% 1970:1992] + 
                                 world_bank$GDP[world_bank$country == 'SLV' & world_bank$year %in% 1970:1992],
                               'Population' = world_bank$Population[world_bank$country == 'CZE' & world_bank$year %in% 1970:1992] + 
                                 world_bank$Population[world_bank$country == 'SLV' & world_bank$year %in% 1970:1992]))

# polity
cown_iso3 <- c('260' = 'DEU', '265' = 'DDR', '315' = 'CSK', '345' = 'YUG', '347' = 'UNK', '678' = 'YEM', '680' = 'YDM', '816' = 'DVM')
polity <- read.csv('polity.csv')
polity <- polity %>% select(ccode, year, polity2)
polity$country <- countrycode(as.character(polity$ccode), 'cown', 'iso3c', nomatch=NULL)
polity <- polity %>% select(-ccode)
# 678, 680, 730, 769, 817, 818
polity$country <- dplyr::recode(polity$country, !!!cown_iso3)


# major interstate disputes
disputes <- read.csv('interstate_violence.csv')
disputes <- disputes %>% select(country, year, inttot, civtot) %>% dplyr::rename(interstate_disputes = inttot, civil_disputes = civtot)
disputes$country <- countrycode(disputes$country, 'country.name', 'iso3c', nomatch=NULL)

# globalization
globalization <- read_csv('GLOBALIZATION_Data_2018.csv')
globalization <- globalization %>% dplyr::rename('Economic_Globalization' = 'KOFEcGIdf',
                                                 'Political_Globalization' = 'KOFPoGIdf',
                                                 'State_ISO' = 'code') %>%
  select(State_ISO, year, Economic_Globalization, Political_Globalization)

# 5-year net migration
migration_5year <- read_csv('API_SM.POP.NETM_DS2_en_csv_v2_103709.csv')
migration_5year <- migration_5year %>% select(-`Indicator Name`, -`Indicator Code`, -`Country Name`) %>% melt(id.vars="Country Code") %>%
  plyr::rename(c(variable = 'year', `Country Code` = 'country', value = 'Net_Migration')) %>%
  mutate(year = as.numeric(as.character(year)))

###########
# MERGING #
###########
border_tiles <- border_tiles %>% left_join(world_bank, by=c('year', 'country'))
border_tiles <- border_tiles %>% left_join(polity %>% dplyr::distinct(), by=c('year',  'country'))
border_tiles <- border_tiles %>% left_join(disputes %>% select(country, year, civil_disputes, interstate_disputes), by=c('year', 'country')) %>% 
  dplyr::rename(civil_disputes_1 = civil_disputes)
border_tiles <- left_join(border_tiles, migration_5year, by=c('country', 'year'))

border_tiles <- dplyr::rename(border_tiles,
                              Polity = polity2,
                              Civil_Disputes = civil_disputes_1,
                              Interstate_Disputes = interstate_disputes)

border_tiles <- left_join(border_tiles, globalization, by=c('country'='State_ISO', 'year'))

#SAVE THE DATASET
save(border_tiles, file="border_tiles.RData")
